pacman::p_load(ggplot2, dplyr, party, caret, corrplot)
# Load the product attribute datasets.
# The data directory is stated once so that it only has to be edited in one
# place when the script is run on another machine.
data_dir <- "~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS"

# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0.0, where
# the default changed to FALSE. The conditional inference trees fitted below
# with party::ctree() need factor (not character) predictors.
products2017 <- read.csv(
  file.path(data_dir, "existingproductattributes2017.csv"),
  stringsAsFactors = TRUE
)
products2018 <- read.csv(
  file.path(data_dir, "existingproductattributesPast.csv"),
  stringsAsFactors = TRUE
)


#Remove Na
# ncol(is.na(products2017))
# summary(is.na(products2017))
# na.omit(products2017)
# summary(products2017)
# complete.cases(products2017)
# str(products2017)
# products2017 <-  products2017[complete.cases(products2017),]
# str(products2017)

# Drop BestSellersRank, the column that contains NA values.
products2017[["BestSellersRank"]] <- NULL

# Find patterns in the data

# Copy of the data without its first two columns, used only for the plots and
# for correlation matrix V.1 (every remaining column is numeric).
products2017Num <- products2017[-(1:2)]
str(object = products2017Num)
## 'data.frame':    80 obs. of  15 variables:
##  $ Price                : num  949 2250 399 410 1080 ...
##  $ x5StarReviews        : int  3 2 3 49 58 83 11 33 16 10 ...
##  $ x4StarReviews        : int  3 1 0 19 31 30 3 19 9 1 ...
##  $ x3StarReviews        : int  2 0 0 8 11 10 0 12 2 1 ...
##  $ x2StarReviews        : int  0 0 0 3 7 9 0 5 0 0 ...
##  $ x1StarReviews        : int  0 0 0 9 36 40 1 9 2 0 ...
##  $ PositiveServiceReview: int  2 1 1 7 7 12 3 5 2 2 ...
##  $ NegativeServiceReview: int  0 0 0 8 20 5 0 3 1 0 ...
##  $ Recommendproduct     : num  0.9 0.9 0.9 0.8 0.7 0.3 0.9 0.7 0.8 0.9 ...
##  $ ShippingWeight       : num  25.8 50 17.4 5.7 7 1.6 7.3 12 1.8 0.75 ...
##  $ ProductDepth         : num  23.9 35 10.5 15 12.9 ...
##  $ ProductWidth         : num  6.62 31.75 8.3 9.9 0.3 ...
##  $ ProductHeight        : num  16.9 19 10.2 1.3 8.9 ...
##  $ ProfitMargin         : num  0.15 0.25 0.08 0.08 0.09 0.05 0.05 0.05 0.05 0.05 ...
##  $ Volume               : int  12 8 12 196 232 332 44 132 64 40 ...
# Visualize the data. First approach 
# for (col in 1:ncol(products2017Num)) { 
#   hist(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6", breaks=50 )}
# 
# for (col in 1:ncol(products2017Num)) { 
#   boxplot(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6" )}




# Decision Tree 1: conditional inference tree for Volume on every other
# column of the full data set, limited to a depth of 4.
products2017_ctree <- ctree(
  formula = Volume ~ .,
  data = products2017,
  controls = ctree_control(maxdepth = 4)
)

plot(products2017_ctree)

# Correlation matrix of the numeric columns, drawn with corrplot.
corrData <- products2017Num %>% cor()
corrplot(corr = corrData)

# Select variables. A. Without 5StarReviews
# (columns 2 and 4 of products2017 are removed by position).
products2017CombA <- products2017[-c(2, 4)]

# Decision Tree A: same model as before, fitted on the reduced column set.
products2017_ctreeA <- ctree(
  formula = Volume ~ .,
  data = products2017CombA,
  controls = ctree_control(maxdepth = 4)
)
plot(products2017_ctreeA)

# PositiveServiceReview and x4StarReviews are the most important variables.

#FEATURE ENGINEERING

# Merge the star-review attributes because they are highly correlated.
# The five individual star columns are removed by name (not by position, so
# the selection does not break if the column order changes) and replaced by
# two aggregates:
#   GoodReviews = 4-star + 3-star reviews
#   BadReviews  = 2-star + 1-star reviews
# x5StarReviews is dropped without being folded into either aggregate.
products2017CombMerg <- products2017[
  setdiff(
    names(products2017),
    c(
      "x5StarReviews", "x4StarReviews", "x3StarReviews",
      "x2StarReviews", "x1StarReviews"
    )
  )
]
products2017CombMerg$GoodReviews <-
  products2017$x4StarReviews + products2017$x3StarReviews
products2017CombMerg$BadReviews <-
  products2017$x2StarReviews + products2017$x1StarReviews


# Decision Tree B: Volume against the merged feature set, depth up to 10.
products2017_ctreeB <- ctree(
  Volume ~ .,
  data = products2017CombMerg,
  controls = ctree_control(maxdepth = 10)
)
plot(products2017_ctreeB)

# Select variables:: Feature Eng. B.2 Without 5StarReviews, Without
# Categorical Variables (the first two columns are removed by position).
products2017CombMerg2 <- products2017CombMerg[c(-1, -2)]

# Correlation Matrix B
corrDataMerg2 <- cor(products2017CombMerg2)
corrplot(corrDataMerg2)

# Visualize the data. B
# One histogram per column. Iterating over the column names (instead of
# 1:ncol(), which yields c(1, 0) for a zero-column frame) is safe for any
# number of columns and gives the title/axis label directly.
for (col_name in names(products2017CombMerg2)) {
  hist(
    products2017CombMerg2[[col_name]],
    main = col_name,
    xlab = col_name,
    border = "black",
    col = "#bcd4e6",
    breaks = 200
  )
}

# There are some outliers in PositiveServiceReview, GoodReviews and
# BadReviews, so let's look at how their values are distributed.

# Number of products for each distinct GoodReviews value.
GoodReviewsNums <- products2017CombMerg %>%
  group_by(GoodReviews) %>%
  summarise(n = n())

# Products sharing the suspicious GoodReviews value. The column is numeric,
# so it is compared with a number rather than with the string '35' (which
# only worked through implicit number-to-character coercion).
GoodReviewsOutliers <- products2017CombMerg %>%
  filter(GoodReviews == 35)

# Products 134 to 141 are duplicates of each other (they differ only in
# price), so they are removed.

# Select Rows:: Feature Eng. C Without duplicated rows
# NOTE(review): rows are dropped by position (34 to 41); this presumably
# matches the product numbers above - confirm if the input file changes.
products2017CombMergClean <- products2017CombMerg[-(34:41), ]

# Decision Tree C: Volume against the merged features on the cleaned rows.
products2017_ctreeC <- ctree(
  formula = Volume ~ .,
  data = products2017CombMergClean,
  controls = ctree_control(maxdepth = 10)
)
plot(products2017_ctreeC)

# Creating dummy variables ------
# Select variables:: Feature Eng. C.2 Without Categorical Variables
# (the first two columns are removed by position).
products2017CombMergClean2 <- products2017CombMergClean[c(-1, -2)]


# Visualize the data. C
# One histogram per column. Iterating over the column names (instead of
# 1:ncol(), which yields c(1, 0) for a zero-column frame) is safe for any
# number of columns and gives the title/axis label directly.
for (col_name in names(products2017CombMergClean2)) {
  hist(
    products2017CombMergClean2[[col_name]],
    main = col_name,
    xlab = col_name,
    border = "black",
    col = "#bcd4e6",
    breaks = 200
  )
}

# Total sales Volume per ProductType.
# The previous version mapped Volume to `fill` on geom_bar(); geom_bar()
# counts rows per ProductType and a continuous fill is dropped by that count,
# so Volume never reached the plot. Mapping Volume to y with geom_col()
# stacks the per-product volumes into one bar per type.
ggplot(products2017CombMergClean, aes(x = ProductType, y = Volume)) +
  geom_col()

# Correlation Matrix C: pairwise correlations of the cleaned numeric columns,
# drawn with corrplot.
corrDataMergClean2 <- products2017CombMergClean2 %>% cor()
corrplot(corr = corrDataMergClean2)

# Detect outliers for x5StarReviews
# Number of products for each distinct x5StarReviews value.
x5StarNums <- products2017 %>%
  group_by(x5StarReviews) %>%
  summarise(n = n())

# Products with more than 1000 five-star reviews (Product 150, 198).
x5StarOutliers <- products2017 %>%
  filter(x5StarReviews > 1000)

# Products with exactly 308 five-star reviews. The column is numeric, so it
# is compared with a number rather than with the string '308'.
x5StarOutliers2 <- products2017 %>%
  filter(x5StarReviews == 308)
# Product 150, 198

# Detect outliers for x4StarReviews: products with more than 100 reviews.
x4StarOutliers <- products2017 %>%
  filter(x4StarReviews > 100)











#priceNums <- products2017 %>% group_by(Price) %>% summarise (n = n()) 
###the same but in different way >>> priceNums <-  summarise(group_by(products2017Num, Price), (n = n()))

# Number of products for each distinct service-review count
# (negative first, then positive).
NegativeNums <- summarise(
  group_by(products2017, NegativeServiceReview),
  n = n()
)
PositiveNums <- summarise(
  group_by(products2017, PositiveServiceReview),
  n = n()
)

# Detect outliers for PositiveServiceReview.
# The column is numeric, so it is compared with a number rather than with
# the string '280'.
positiveServiceOutliers <- products2017 %>%
  filter(PositiveServiceReview == 280)
# remove outliers
#products2017 <- products2017[-c(36:41),]

# Detect outliers for NegativeServiceReview (numeric comparison, as above).
negativeServiceOutliers <- products2017 %>%
  filter(NegativeServiceReview == 112)

# Remove this outlier because it clearly differs from the rest in every
# attribute.
#products2017 <- products2017[-c(23),]

hist(products2017$PositiveServiceReview)

# box_plot <- boxplot(products2017[, c("","")])
#sort(products2017Num$NegativeServiceReview)